# ML for continuous target variable
#
# Model families covered below:
# 1. Tree Based Models
# 2. Regression
# 3. Neural Networks
#
# NOTE(review): the source()/fread() paths are absolute and user-specific;
# they will only resolve on the original author's machine. Consider a
# project-relative path scheme (e.g. here::here()) — TODO confirm.
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Classification/code/carga_librerias.R')
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
# f_partition() returns a list(train=, test=) split of the data
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/f_partition.R')
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/regression_metrics.R')

# 80/20 train/test split; fixed seed keeps the partition reproducible
whole_data<-f_partition(df=fread('/Users/ssobrinou/IE/Advanced/2019_Advanced/Datasets/data_automobile_ready.csv'),
                        test_proportion = 0.2,
                        seed = 872367823)


str(whole_data)
## List of 2
##  $ train:Classes 'data.table' and 'data.frame':  156 obs. of  31 variables:
##   ..$ fuel_gas          : int [1:156] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : int [1:156] 0 0 0 0 0 0 1 0 0 0 ...
##   ..$ doors_others      : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : int [1:156] 0 1 0 1 1 1 1 1 0 0 ...
##   ..$ body_others       : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : int [1:156] 1 0 0 0 0 0 0 1 1 1 ...
##   ..$ body_wagon        : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ drive_rwd         : int [1:156] 0 0 0 1 1 0 1 1 0 0 ...
##   ..$ engine_loc_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
##   ..$ length            : num [1:156] 167 145 173 176 169 ...
##   ..$ width             : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
##   ..$ height            : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
##   ..$ weight            : int [1:156] 1950 1819 2324 2714 2204 2221 2818 2169 2385 2010 ...
##   ..$ engine_type_others: int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ cyl_others        : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ cyl_six           : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : int [1:156] 91 92 120 146 98 109 156 98 108 92 ...
##   ..$ fuel_sys_idi      : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : int [1:156] 0 0 0 1 0 1 0 0 0 0 ...
##   ..$ fuel_sys_others   : int [1:156] 0 1 0 0 0 0 1 0 0 1 ...
##   ..$ bore              : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
##   ..$ stroke            : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
##   ..$ compr_ratio       : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
##   ..$ hp                : int [1:156] 68 76 97 116 70 90 145 70 82 76 ...
##   ..$ peak_rpm          : int [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
##   ..$ city_mpg          : int [1:156] 31 31 27 24 29 24 19 29 24 30 ...
##   ..$ high_mpg          : int [1:156] 38 38 34 30 34 29 24 34 25 34 ...
##   ..$ price             : int [1:156] 7395 6855 8949 11549 8238 9980 12764 8058 9233 7295 ...
##   ..$ make_agg_toyota   : int [1:156] 0 0 0 1 1 0 0 1 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ test :Classes 'data.table' and 'data.frame':  39 obs. of  31 variables:
##   ..$ fuel_gas          : int [1:39] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_others      : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : int [1:39] 0 0 0 1 1 0 0 1 1 0 ...
##   ..$ body_others       : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : int [1:39] 1 0 1 0 0 0 1 0 0 1 ...
##   ..$ body_wagon        : int [1:39] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : int [1:39] 1 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_rwd         : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_loc_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
##   ..$ length            : num [1:39] 177 193 189 141 157 ...
##   ..$ width             : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
##   ..$ height            : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
##   ..$ weight            : int [1:39] 2824 2954 3230 1488 1876 1967 1989 1940 2289 2304 ...
##   ..$ engine_type_others: int [1:39] 0 0 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_others        : int [1:39] 1 1 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_six           : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : int [1:39] 136 136 209 61 90 90 90 92 110 110 ...
##   ..$ fuel_sys_idi      : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : int [1:39] 1 1 1 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_others   : int [1:39] 0 0 0 0 0 0 0 1 1 1 ...
##   ..$ bore              : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
##   ..$ stroke            : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
##   ..$ compr_ratio       : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
##   ..$ hp                : int [1:39] 115 110 182 48 68 68 68 76 86 86 ...
##   ..$ peak_rpm          : int [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
##   ..$ city_mpg          : int [1:39] 18 19 16 47 31 31 31 30 27 27 ...
##   ..$ high_mpg          : int [1:39] 22 25 22 53 38 38 38 34 33 33 ...
##   ..$ price             : int [1:39] 17450 18920 30760 5151 6377 6229 6692 6529 9095 8845 ...
##   ..$ make_agg_toyota   : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>
# The CSV load leaves dummies/counts as integer columns; convert every
# integer column to numeric so downstream modelling functions receive a
# uniform double matrix.
# (Original computed sapply(x, is.integer) twice; compute the column set
# once with type-stable vapply and reuse it for both := and .SDcols.)
whole_data <- lapply(whole_data, function(dt) {
  int_cols <- names(dt)[vapply(dt, is.integer, logical(1))]
  if (length(int_cols) > 0) {
    dt[, (int_cols) := lapply(.SD, as.numeric), .SDcols = int_cols]
  }
  dt
})

str(whole_data)
## List of 2
##  $ train:Classes 'data.table' and 'data.frame':  156 obs. of  31 variables:
##   ..$ fuel_gas          : num [1:156] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : num [1:156] 0 0 0 0 0 0 1 0 0 0 ...
##   ..$ doors_others      : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : num [1:156] 0 1 0 1 1 1 1 1 0 0 ...
##   ..$ body_others       : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : num [1:156] 1 0 0 0 0 0 0 1 1 1 ...
##   ..$ body_wagon        : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ drive_rwd         : num [1:156] 0 0 0 1 1 0 1 1 0 0 ...
##   ..$ engine_loc_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
##   ..$ length            : num [1:156] 167 145 173 176 169 ...
##   ..$ width             : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
##   ..$ height            : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
##   ..$ weight            : num [1:156] 1950 1819 2324 2714 2204 ...
##   ..$ engine_type_others: num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ cyl_others        : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ cyl_six           : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : num [1:156] 91 92 120 146 98 109 156 98 108 92 ...
##   ..$ fuel_sys_idi      : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : num [1:156] 0 0 0 1 0 1 0 0 0 0 ...
##   ..$ fuel_sys_others   : num [1:156] 0 1 0 0 0 0 1 0 0 1 ...
##   ..$ bore              : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
##   ..$ stroke            : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
##   ..$ compr_ratio       : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
##   ..$ hp                : num [1:156] 68 76 97 116 70 90 145 70 82 76 ...
##   ..$ peak_rpm          : num [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
##   ..$ city_mpg          : num [1:156] 31 31 27 24 29 24 19 29 24 30 ...
##   ..$ high_mpg          : num [1:156] 38 38 34 30 34 29 24 34 25 34 ...
##   ..$ price             : num [1:156] 7395 6855 8949 11549 8238 ...
##   ..$ make_agg_toyota   : num [1:156] 0 0 0 1 1 0 0 1 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ test :Classes 'data.table' and 'data.frame':  39 obs. of  31 variables:
##   ..$ fuel_gas          : num [1:39] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_others      : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : num [1:39] 0 0 0 1 1 0 0 1 1 0 ...
##   ..$ body_others       : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : num [1:39] 1 0 1 0 0 0 1 0 0 1 ...
##   ..$ body_wagon        : num [1:39] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : num [1:39] 1 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_rwd         : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_loc_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
##   ..$ length            : num [1:39] 177 193 189 141 157 ...
##   ..$ width             : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
##   ..$ height            : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
##   ..$ weight            : num [1:39] 2824 2954 3230 1488 1876 ...
##   ..$ engine_type_others: num [1:39] 0 0 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_others        : num [1:39] 1 1 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_six           : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : num [1:39] 136 136 209 61 90 90 90 92 110 110 ...
##   ..$ fuel_sys_idi      : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : num [1:39] 1 1 1 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_others   : num [1:39] 0 0 0 0 0 0 0 1 1 1 ...
##   ..$ bore              : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
##   ..$ stroke            : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
##   ..$ compr_ratio       : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
##   ..$ hp                : num [1:39] 115 110 182 48 68 68 68 76 86 86 ...
##   ..$ peak_rpm          : num [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
##   ..$ city_mpg          : num [1:39] 18 19 16 47 31 31 31 30 27 27 ...
##   ..$ high_mpg          : num [1:39] 22 25 22 53 38 38 38 34 33 33 ...
##   ..$ price             : num [1:39] 17450 18920 30760 5151 6377 ...
##   ..$ make_agg_toyota   : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>
# Define the model formula once: price against all other variables.
# (price ~ . is already a formula object; no as.formula() wrapper needed.)
formula <- price ~ .


#### 1.1 Base R Partitioning Tree (CART)
library(rpart)
library(rpart.plot)

# method = 'anova' grows a regression tree; model = TRUE keeps the model
# frame stored inside the fit object
tree_0 <- rpart(formula = formula,
                data = whole_data$train,
                method = 'anova',
                model = TRUE)

print(tree_0)
## n= 156 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 156 9507260000 13064.040  
##    2) engine_size< 182 146 3229339000 11427.030  
##      4) weight< 2544 89  442823600  8395.393  
##        8) length< 172.7 63   86829530  7401.762 *
##        9) length>=172.7 26  143078300 10803.040 *
##      5) weight>=2544 57  691334500 16160.630  
##       10) width< 68.6 50  491555400 15542.020  
##         20) hp< 118 27  142082300 13943.780 *
##         21) hp>=118 23  199542200 17418.220  
##           42) stroke>=3.31 10   21436830 14821.100 *
##           43) stroke< 3.31 13   58770460 19416.000 *
##       11) width>=68.6 7   43973520 20579.290 *
##    3) engine_size>=182 10  174348500 36964.500 *
summary(tree_0)
## Call:
## rpart(formula = formula, data = whole_data$train, method = "anova", 
##     model = TRUE)
##   n= 156 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.64199066      0 1.00000000 1.0061701 0.19634783
## 2 0.22037697      1 0.35800934 0.4096693 0.05526871
## 3 0.02239507      2 0.13763236 0.1966299 0.03396699
## 4 0.01638806      3 0.11523729 0.1784056 0.03210153
## 5 0.01577015      4 0.09884923 0.1801067 0.02947649
## 6 0.01255198      5 0.08307909 0.1762945 0.02932287
## 7 0.01000000      6 0.07052711 0.1606481 0.02614540
## 
## Variable importance
## engine_size      weight          hp    city_mpg       width      length 
##          22          18          16          16           9           9 
##    high_mpg   drive_rwd 
##           5           4 
## 
## Node number 1: 156 observations,    complexity param=0.6419907
##   mean=13064.04, MSE=6.094397e+07 
##   left son=2 (146 obs) right son=3 (10 obs)
##   Primary splits:
##       engine_size < 182    to the left,  improve=0.6419907, (0 missing)
##       city_mpg    < 17.5   to the right, improve=0.5479412, (0 missing)
##       hp          < 175.5  to the left,  improve=0.5262288, (0 missing)
##       weight      < 2697.5 to the left,  improve=0.5036809, (0 missing)
##       high_mpg    < 28.5   to the right, improve=0.4750277, (0 missing)
##   Surrogate splits:
##       weight   < 3495   to the left,  agree=0.981, adj=0.7, (0 split)
##       hp       < 175.5  to the left,  agree=0.981, adj=0.7, (0 split)
##       city_mpg < 16.5   to the right, agree=0.981, adj=0.7, (0 split)
##       length   < 199.05 to the left,  agree=0.968, adj=0.5, (0 split)
##       width    < 69.25  to the left,  agree=0.968, adj=0.5, (0 split)
## 
## Node number 2: 146 observations,    complexity param=0.220377
##   mean=11427.03, MSE=2.211876e+07 
##   left son=4 (89 obs) right son=5 (57 obs)
##   Primary splits:
##       weight      < 2544   to the left,  improve=0.6487956, (0 missing)
##       high_mpg    < 28.5   to the right, improve=0.5943413, (0 missing)
##       engine_size < 126    to the left,  improve=0.5689015, (0 missing)
##       hp          < 94.5   to the left,  improve=0.5402361, (0 missing)
##       city_mpg    < 23.5   to the right, improve=0.4989328, (0 missing)
##   Surrogate splits:
##       high_mpg    < 28.5   to the right, agree=0.911, adj=0.772, (0 split)
##       engine_size < 126    to the left,  agree=0.897, adj=0.737, (0 split)
##       hp          < 104    to the left,  agree=0.877, adj=0.684, (0 split)
##       city_mpg    < 22     to the right, agree=0.863, adj=0.649, (0 split)
##       drive_rwd   < 0.5    to the left,  agree=0.856, adj=0.632, (0 split)
## 
## Node number 3: 10 observations
##   mean=36964.5, MSE=1.743485e+07 
## 
## Node number 4: 89 observations,    complexity param=0.02239507
##   mean=8395.393, MSE=4975546 
##   left son=8 (63 obs) right son=9 (26 obs)
##   Primary splits:
##       length     < 172.7  to the left,  improve=0.4808139, (0 missing)
##       weight     < 2287.5 to the left,  improve=0.4666525, (0 missing)
##       wheel_base < 98.6   to the left,  improve=0.4051903, (0 missing)
##       width      < 64.5   to the left,  improve=0.4023945, (0 missing)
##       hp         < 83     to the left,  improve=0.3833154, (0 missing)
##   Surrogate splits:
##       wheel_base  < 97.85  to the left,  agree=0.921, adj=0.731, (0 split)
##       weight      < 2301   to the left,  agree=0.910, adj=0.692, (0 split)
##       engine_size < 115.5  to the left,  agree=0.888, adj=0.615, (0 split)
##       width       < 65.55  to the left,  agree=0.876, adj=0.577, (0 split)
##       bore        < 3.29   to the left,  agree=0.831, adj=0.423, (0 split)
## 
## Node number 5: 57 observations,    complexity param=0.01638806
##   mean=16160.63, MSE=1.212868e+07 
##   left son=10 (50 obs) right son=11 (7 obs)
##   Primary splits:
##       width      < 68.6   to the left,  improve=0.2253693, (0 missing)
##       hp         < 118    to the left,  improve=0.2057451, (0 missing)
##       cyl_six    < 0.5    to the left,  improve=0.1899323, (0 missing)
##       wheel_base < 100.8  to the left,  improve=0.1879356, (0 missing)
##       weight     < 2697.5 to the left,  improve=0.1791694, (0 missing)
##   Surrogate splits:
##       wheel_base < 108.55 to the left,  agree=0.895, adj=0.143, (0 split)
##       cyl_others < 0.5    to the left,  agree=0.895, adj=0.143, (0 split)
## 
## Node number 8: 63 observations
##   mean=7401.762, MSE=1378246 
## 
## Node number 9: 26 observations
##   mean=10803.04, MSE=5503013 
## 
## Node number 10: 50 observations,    complexity param=0.01577015
##   mean=15542.02, MSE=9831109 
##   left son=20 (27 obs) right son=21 (23 obs)
##   Primary splits:
##       hp          < 118    to the left,  improve=0.3050132, (0 missing)
##       engine_size < 162.5  to the left,  improve=0.2531597, (0 missing)
##       cyl_six     < 0.5    to the left,  improve=0.2348309, (0 missing)
##       weight      < 2697.5 to the left,  improve=0.1646924, (0 missing)
##       peak_rpm    < 4375   to the right, improve=0.1423281, (0 missing)
##   Surrogate splits:
##       city_mpg    < 20.5   to the right, agree=0.86, adj=0.696, (0 split)
##       engine_size < 154    to the left,  agree=0.84, adj=0.652, (0 split)
##       cyl_six     < 0.5    to the left,  agree=0.76, adj=0.478, (0 split)
##       high_mpg    < 26.5   to the right, agree=0.76, adj=0.478, (0 split)
##       height      < 54.85  to the right, agree=0.74, adj=0.435, (0 split)
## 
## Node number 11: 7 observations
##   mean=20579.29, MSE=6281932 
## 
## Node number 20: 27 observations
##   mean=13943.78, MSE=5262308 
## 
## Node number 21: 23 observations,    complexity param=0.01255198
##   mean=17418.22, MSE=8675749 
##   left son=42 (10 obs) right son=43 (13 obs)
##   Primary splits:
##       stroke      < 3.31   to the right, improve=0.5980435, (0 missing)
##       high_mpg    < 24.5   to the left,  improve=0.3815231, (0 missing)
##       height      < 54.2   to the left,  improve=0.3073280, (0 missing)
##       compr_ratio < 7.65   to the left,  improve=0.2493117, (0 missing)
##       body_sedan  < 0.5    to the left,  improve=0.2394175, (0 missing)
##   Surrogate splits:
##       height          < 54.2   to the left,  agree=0.826, adj=0.6, (0 split)
##       fuel_sys_mpfi   < 0.5    to the left,  agree=0.783, adj=0.5, (0 split)
##       fuel_sys_others < 0.5    to the right, agree=0.783, adj=0.5, (0 split)
##       bore            < 3.29   to the left,  agree=0.783, adj=0.5, (0 split)
##       width           < 66.7   to the left,  agree=0.739, adj=0.4, (0 split)
## 
## Node number 42: 10 observations
##   mean=14821.1, MSE=2143683 
## 
## Node number 43: 13 observations
##   mean=19416, MSE=4520804
# Visualize the fitted tree
rpart.plot(tree_0, digits = 4, type = 2, box.palette = 'Gn')

# Hold-out predictions from the single regression tree
test_tree <- predict(tree_0, newdata = whole_data$test, type = 'vector')

# Actual price vs prediction, one row per test observation
df_pred <- whole_data$test[, .(id = seq_len(.N), price, test_tree)]
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  3 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price    : num  17450 18920 30760 5151 6377 ...
##  $ test_tree: num  13944 20579 36964 7402 7402 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Long format lets ggplot colour actuals and predictions separately
ggplot(melt(df_pred, id.vars = 'id'),
       aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) +
  geom_line(alpha = 0.65) +
  ylim(0, 50000) +
  xlab('') +
  ylab('$') +
  ggtitle('Regression Tree - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red'))

#### 1.2 Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin

# Defaults: 500 trees; mtry defaults shown in the print() output below
rf_0 <- randomForest(formula = formula, data = whole_data$train)
print(rf_0)
## 
## Call:
##  randomForest(formula = formula, data = whole_data$train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 5619661
##                     % Var explained: 90.78
test_rf <- predict(rf_0, newdata = whole_data$test, type = 'response')

# Append forest predictions to the comparison table
df_pred <- cbind(df_pred, test_rf)
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  4 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price    : num  17450 18920 30760 5151 6377 ...
##  $ test_tree: num  13944 20579 36964 7402 7402 ...
##  $ test_rf  : num  16786 18333 34747 6027 6002 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Fixed: title previously said 'Regression Tree' (copy-paste from 1.1)
ggplot(melt(df_pred, id.vars = 'id'), aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) + geom_line(alpha = 0.65) +
  ylim(0, 50000) + xlab('') + ylab('$') +
  ggtitle('Random Forest - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red', 'blue'))

#### 1.3 Boosting Tree
library(xgboost)

# xgboost has no formula interface: pass a numeric predictor matrix and
# the target separately as the label.
# NOTE(review): 'reg:linear' is a deprecated alias of 'reg:squarederror'
# in newer xgboost releases; kept for compatibility with the installed
# version — confirm before upgrading.
xgb_0 <- xgboost(booster = 'gbtree',
                 data = as.matrix(whole_data$train[, !'price', with = FALSE]),
                 label = whole_data$train$price,
                 nrounds = 50,
                 objective = 'reg:linear')
## [1]  train-rmse:11027.429688 
## [2]  train-rmse:8065.855469 
## [3]  train-rmse:5963.436523 
## [4]  train-rmse:4451.936523 
## [5]  train-rmse:3378.894043 
## [6]  train-rmse:2617.100586 
## [7]  train-rmse:2044.587769 
## [8]  train-rmse:1629.894897 
## [9]  train-rmse:1320.864014 
## [10] train-rmse:1072.105591 
## [11] train-rmse:892.479187 
## [12] train-rmse:756.755859 
## [13] train-rmse:648.792053 
## [14] train-rmse:577.898621 
## [15] train-rmse:516.197937 
## [16] train-rmse:467.336212 
## [17] train-rmse:436.486969 
## [18] train-rmse:406.966827 
## [19] train-rmse:381.598633 
## [20] train-rmse:362.346283 
## [21] train-rmse:348.406586 
## [22] train-rmse:339.302521 
## [23] train-rmse:323.668304 
## [24] train-rmse:315.626312 
## [25] train-rmse:308.365356 
## [26] train-rmse:304.067474 
## [27] train-rmse:299.807007 
## [28] train-rmse:294.491669 
## [29] train-rmse:289.260681 
## [30] train-rmse:286.898315 
## [31] train-rmse:275.956177 
## [32] train-rmse:274.439362 
## [33] train-rmse:266.735413 
## [34] train-rmse:261.335022 
## [35] train-rmse:255.965347 
## [36] train-rmse:254.613876 
## [37] train-rmse:250.018661 
## [38] train-rmse:247.678406 
## [39] train-rmse:245.217163 
## [40] train-rmse:240.946564 
## [41] train-rmse:238.491440 
## [42] train-rmse:237.925644 
## [43] train-rmse:237.266998 
## [44] train-rmse:236.705490 
## [45] train-rmse:234.588348 
## [46] train-rmse:233.108276 
## [47] train-rmse:231.601913 
## [48] train-rmse:231.403214 
## [49] train-rmse:230.231857 
## [50] train-rmse:229.325745
# Booster summary: call, resolved params, niter and the evaluation log
print(xgb_0)
## ##### xgb.Booster
## raw: 80.6 Kb 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, booster = "gbtree", objective = "reg:linear")
## params (as set within xgb.train):
##   booster = "gbtree", objective = "reg:linear", silent = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 30 
## niter: 50
## nfeatures : 30 
## evaluation_log:
##     iter train_rmse
##        1 11027.4297
##        2  8065.8555
## ---                
##       49   230.2319
##       50   229.3257
# Predict on the test matrix (same column layout, target dropped)
test_xgb <- predict(xgb_0,
                    newdata = as.matrix(whole_data$test[, !'price', with = FALSE]),
                    type = 'response')

df_pred <- cbind(df_pred, test_xgb)
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  5 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price    : num  17450 18920 30760 5151 6377 ...
##  $ test_tree: num  13944 20579 36964 7402 7402 ...
##  $ test_rf  : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb : num  18353 17226 39207 5644 5737 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Fixed: title previously said 'Regression Tree' (copy-paste from 1.1)
ggplot(melt(df_pred, id.vars = 'id'), aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) + geom_line(alpha = 0.65) +
  ylim(0, 50000) + xlab('') + ylab('$') +
  ggtitle('Gradient Boosting - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red', 'blue', 'forestgreen'))

#### 2.1 Regression with StepWise feature selection
library(MASS)

# Fit the full linear model, then let stepAIC() add/drop terms by AIC;
# trace = FALSE suppresses the per-step log
lm_0 <- stepAIC(object = lm(formula = formula, data = whole_data$train),
                trace = FALSE)

summary(lm_0)
## 
## Call:
## lm(formula = price ~ fuel_gas + body_others + body_wagon + engine_loc_others + 
##     wheel_base + weight + engine_type_others + cyl_others + cyl_six + 
##     engine_size + fuel_sys_mpfi + stroke + compr_ratio + peak_rpm + 
##     city_mpg + make_agg_toyota, data = whole_data$train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5137.6 -1508.6  -169.6  1203.3 10249.6 
## 
## Coefficients:
##                      Estimate Std. Error t value          Pr(>|t|)    
## (Intercept)          8895.278  10865.034   0.819          0.414355    
## fuel_gas           -19493.083   5406.508  -3.605          0.000433 ***
## body_others          2236.205    977.614   2.287          0.023682 *  
## body_wagon          -1264.349    705.093  -1.793          0.075121 .  
## engine_loc_others   11125.224   2172.291   5.121 0.000000992671849 ***
## wheel_base            172.141     61.819   2.785          0.006107 ** 
## weight                  3.787      1.594   2.377          0.018840 *  
## engine_type_others  -4194.023    653.879  -6.414 0.000000002065665 ***
## cyl_others           3458.041   1114.765   3.102          0.002328 ** 
## cyl_six              4392.530    849.152   5.173 0.000000788665742 ***
## engine_size           114.357     13.924   8.213 0.000000000000134 ***
## fuel_sys_mpfi        1349.560    611.260   2.208          0.028895 *  
## stroke              -5585.578    872.272  -6.403 0.000000002179275 ***
## compr_ratio         -1308.497    389.470  -3.360          0.001007 ** 
## peak_rpm                1.545      0.628   2.460          0.015139 *  
## city_mpg              154.041     83.684   1.841          0.067790 .  
## make_agg_toyota     -2195.249    606.064  -3.622          0.000409 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2359 on 139 degrees of freedom
## Multiple R-squared:  0.9186, Adjusted R-squared:  0.9093 
## F-statistic:  98.1 on 16 and 139 DF,  p-value: < 0.00000000000000022
# predict.lm uses only the columns retained by the stepwise selection
test_lm <- predict(lm_0, newdata = whole_data$test)

df_pred <- cbind(df_pred, test_lm)
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  6 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price    : num  17450 18920 30760 5151 6377 ...
##  $ test_tree: num  13944 20579 36964 7402 7402 ...
##  $ test_rf  : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb : num  18353 17226 39207 5644 5737 ...
##  $ test_lm  : num  19378 19208 30497 2257 5859 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Fixed: title previously said 'Regression Tree' (copy-paste from 1.1)
ggplot(melt(df_pred, id.vars = 'id'), aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) + geom_line(alpha = 0.65) +
  ylim(0, 50000) + xlab('') + ylab('$') +
  ggtitle('Stepwise Linear Regression - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red', 'blue', 'forestgreen', 'orange'))

#### 2.2 Regression with regularization (lasso via glmnet)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16

# Step 1: cross-validate the L1 penalty (alpha = 1 -> lasso)
glmnet_0 <- cv.glmnet(x = data.matrix(whole_data$train[, !'price']),
                      y = whole_data$train[['price']],
                      family = 'gaussian',
                      alpha = 1)
# Step 2: refit a single-lambda model at the CV-optimal penalty,
# reusing the same name for the final model object
glmnet_0 <- glmnet(x = data.matrix(whole_data$train[, !'price']),
                   y = whole_data$train[['price']],
                   family = 'gaussian',
                   alpha = 1,
                   lambda = glmnet_0$lambda.min)

glmnet_0
## 
## Call:  glmnet(x = data.matrix(whole_data$train[, !"price"]), y = whole_data$train[["price"]],      family = "gaussian", alpha = 1, lambda = glmnet_0$lambda.min) 
## 
##      Df   %Dev Lambda
## [1,] 22 0.9146   59.9
# Same output as auto-printing above: Df, %Dev and the fitted lambda
print(glmnet_0)
## 
## Call:  glmnet(x = data.matrix(whole_data$train[, !"price"]), y = whole_data$train[["price"]],      family = "gaussian", alpha = 1, lambda = glmnet_0$lambda.min) 
## 
##      Df   %Dev Lambda
## [1,] 22 0.9146   59.9
# Predict at the penalty the model was actually fitted with (lambda.min
# from CV). The original passed s = 0, which requests the *unpenalized*
# fit and contradicts the CV-selected lambda.
test_glmnet <- predict(glmnet_0,
                       newx = as.matrix(whole_data$test[, !'price']),
                       s = glmnet_0$lambda)

# predict.glmnet returns a one-column matrix; keep the vector
df_pred <- cbind(df_pred, test_glmnet = test_glmnet[, 1])
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  7 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  17450 18920 30760 5151 6377 ...
##  $ test_tree  : num  13944 20579 36964 7402 7402 ...
##  $ test_rf    : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb   : num  18353 17226 39207 5644 5737 ...
##  $ test_lm    : num  19378 19208 30497 2257 5859 ...
##  $ test_glmnet: num  17943 19133 28520 442 5917 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Fixed: title previously said 'Regression Tree' (copy-paste from 1.1)
ggplot(melt(df_pred, id.vars = 'id'), aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) + geom_line(alpha = 0.65) +
  ylim(0, 50000) + xlab('') + ylab('$') +
  ggtitle('Lasso Regression - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red', 'blue', 'forestgreen', 'orange', 'gray'))

#### 2.3 Boosting Regression
library(xgboost)

# gblinear boosts a linear model instead of trees; data must again be a
# numeric matrix with the target passed separately as the label.
# NOTE(review): 'reg:linear' is a deprecated alias of 'reg:squarederror'
# in newer xgboost releases; kept for compatibility with the installed
# version — confirm before upgrading.
xgb_reg_0 <- xgboost(booster = 'gblinear',
                     data = as.matrix(whole_data$train[, !'price', with = FALSE]),
                     label = whole_data$train$price,
                     nrounds = 50,
                     objective = 'reg:linear')
## [1]  train-rmse:4828.959961 
## [2]  train-rmse:4155.766113 
## [3]  train-rmse:3926.983643 
## [4]  train-rmse:3810.656250 
## [5]  train-rmse:3729.830078 
## [6]  train-rmse:3664.711182 
## [7]  train-rmse:3608.852783 
## [8]  train-rmse:3559.350098 
## [9]  train-rmse:3514.524658 
## [10] train-rmse:3473.294922 
## [11] train-rmse:3434.931885 
## [12] train-rmse:3398.937988 
## [13] train-rmse:3364.964600 
## [14] train-rmse:3332.764648 
## [15] train-rmse:3302.157715 
## [16] train-rmse:3273.004395 
## [17] train-rmse:3245.196533 
## [18] train-rmse:3218.645020 
## [19] train-rmse:3193.274658 
## [20] train-rmse:3169.019775 
## [21] train-rmse:3145.822021 
## [22] train-rmse:3123.627686 
## [23] train-rmse:3102.388184 
## [24] train-rmse:3082.056885 
## [25] train-rmse:3062.590820 
## [26] train-rmse:3043.950195 
## [27] train-rmse:3026.094482 
## [28] train-rmse:3008.989502 
## [29] train-rmse:2992.599121 
## [30] train-rmse:2976.889893 
## [31] train-rmse:2961.831787 
## [32] train-rmse:2947.393311 
## [33] train-rmse:2933.547119 
## [34] train-rmse:2920.264893 
## [35] train-rmse:2907.520508 
## [36] train-rmse:2895.290039 
## [37] train-rmse:2883.550049 
## [38] train-rmse:2872.277344 
## [39] train-rmse:2861.450684 
## [40] train-rmse:2851.049316 
## [41] train-rmse:2841.054199 
## [42] train-rmse:2831.446289 
## [43] train-rmse:2822.208984 
## [44] train-rmse:2813.324707 
## [45] train-rmse:2804.776855 
## [46] train-rmse:2796.551270 
## [47] train-rmse:2788.633057 
## [48] train-rmse:2781.008057 
## [49] train-rmse:2773.662598 
## [50] train-rmse:2766.585449
# Booster summary: call, resolved params, niter and the evaluation log
print(xgb_reg_0)
## ##### xgb.Booster
## raw: 520 bytes 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, booster = "gblinear", objective = "reg:linear")
## params (as set within xgb.train):
##   booster = "gblinear", objective = "reg:linear", silent = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 30 
## niter: 50
## nfeatures : 30 
## evaluation_log:
##     iter train_rmse
##        1   4828.960
##        2   4155.766
## ---                
##       49   2773.663
##       50   2766.585
# Predict with the boosted linear model on the test matrix
test_xgb_reg <- predict(xgb_reg_0,
                        newdata = as.matrix(whole_data$test[, !'price', with = FALSE]),
                        type = 'response')

df_pred <- cbind(df_pred, test_xgb_reg)
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  8 variables:
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price       : num  17450 18920 30760 5151 6377 ...
##  $ test_tree   : num  13944 20579 36964 7402 7402 ...
##  $ test_rf     : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb    : num  18353 17226 39207 5644 5737 ...
##  $ test_lm     : num  19378 19208 30497 2257 5859 ...
##  $ test_glmnet : num  17943 19133 28520 442 5917 ...
##  $ test_xgb_reg: num  23643 21512 26985 8071 6221 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Fixed: title previously said 'Regression Tree' (copy-paste from 1.1)
ggplot(melt(df_pred, id.vars = 'id'), aes(x = id, y = value, colour = variable)) +
  geom_point(alpha = 0.65) + geom_line(alpha = 0.65) +
  ylim(0, 50000) + xlab('') + ylab('$') +
  ggtitle('Boosted Linear Regression - Test Prediction on Automobile Price') +
  scale_colour_manual(values = c('black', 'red', 'blue', 'forestgreen', 'orange', 'gray', 'palegreen'))

#### 3. Neural Networks
library(nnet)  # simple feed-foward neural network

# Single-hidden-layer network: 3 hidden units, skip-layer connections from
# inputs straight to the output, and linear (not logistic) output units since
# the target is a continuous price.
# Fix: skip=T -> skip=TRUE (T is a reassignable binding, not a keyword).
# NOTE(review): `formula` is defined earlier in the file (outside this chunk);
# inputs are not scaled here, which likely explains the huge initial loss --
# consider standardizing predictors before nnet.
nnet_0<-nnet(formula = formula, 
                 data=whole_data$train,
                 size=3,skip=TRUE,
                 linout = TRUE)
## # weights:  127
## initial  value 23290727608.253834 
## iter  10 value 11001526686.852222
## iter  20 value 1619905849.222670
## iter  30 value 745089023.626550
## final  value 741405918.436037 
## converged
# Architecture summary: 30 inputs -> 3 hidden -> 1 output, 127 weights
# (including the skip-layer input->output connections).
print(nnet_0)
## a 30-3-1 network with 127 weights
## inputs: fuel_gas aspiration_turbo doors_others doors_two body_others body_sedan body_wagon drive_others drive_rwd engine_loc_others wheel_base length width height weight engine_type_others cyl_others cyl_six engine_size fuel_sys_idi fuel_sys_mpfi fuel_sys_others bore stroke compr_ratio hp peak_rpm city_mpg high_mpg make_agg_toyota 
## output(s): price 
## options were - skip-layer connections  linear output units
summary(nnet_0)
## a 30-3-1 network with 127 weights
## options were - skip-layer connections  linear output units 
##     b->h1    i1->h1    i2->h1    i3->h1    i4->h1    i5->h1    i6->h1 
##     -0.52     -0.21     -0.29     -0.23     -0.01      0.00     -0.52 
##    i7->h1    i8->h1    i9->h1   i10->h1   i11->h1   i12->h1   i13->h1 
##     -0.06      0.01      0.41      0.62     -0.66     -0.60     -0.66 
##   i14->h1   i15->h1   i16->h1   i17->h1   i18->h1   i19->h1   i20->h1 
##      0.05     -0.66     -0.37      0.33      0.20     -0.26     -0.11 
##   i21->h1   i22->h1   i23->h1   i24->h1   i25->h1   i26->h1   i27->h1 
##     -0.41     -0.06      0.01      0.61      0.46      0.02     -0.22 
##   i28->h1   i29->h1   i30->h1 
##     -0.10      0.00     -0.37 
##     b->h2    i1->h2    i2->h2    i3->h2    i4->h2    i5->h2    i6->h2 
##     -0.12     -0.16     -0.33     -0.26      0.70     -0.43     -0.65 
##    i7->h2    i8->h2    i9->h2   i10->h2   i11->h2   i12->h2   i13->h2 
##      0.43      0.13     -0.57     -0.70      0.46     -0.28      0.19 
##   i14->h2   i15->h2   i16->h2   i17->h2   i18->h2   i19->h2   i20->h2 
##     -0.34      0.50     -0.69     -0.06      0.13     -0.62     -0.41 
##   i21->h2   i22->h2   i23->h2   i24->h2   i25->h2   i26->h2   i27->h2 
##      0.29      0.23     -0.05     -0.58     -0.63     -0.31      0.60 
##   i28->h2   i29->h2   i30->h2 
##      0.21      0.17      0.25 
##     b->h3    i1->h3    i2->h3    i3->h3    i4->h3    i5->h3    i6->h3 
##      0.52     -0.58      0.11     -0.42      0.40     -0.04     -0.66 
##    i7->h3    i8->h3    i9->h3   i10->h3   i11->h3   i12->h3   i13->h3 
##      0.24     -0.06     -0.02      0.03     -0.62     -0.11     -0.49 
##   i14->h3   i15->h3   i16->h3   i17->h3   i18->h3   i19->h3   i20->h3 
##      0.44      0.45     -0.43      0.44      0.44      0.03     -0.35 
##   i21->h3   i22->h3   i23->h3   i24->h3   i25->h3   i26->h3   i27->h3 
##      0.04      0.54     -0.29     -0.25     -0.70     -0.45      0.05 
##   i28->h3   i29->h3   i30->h3 
##     -0.68     -0.42     -0.37 
##      b->o     h1->o     h2->o     h3->o     i1->o     i2->o     i3->o 
##  -4445.05      0.53  -4444.69  -4445.79 -11115.82    869.90  -2234.70 
##     i4->o     i5->o     i6->o     i7->o     i8->o     i9->o    i10->o 
##     33.60   2750.63    589.78   -719.46   1150.13   -122.35  10490.01 
##    i11->o    i12->o    i13->o    i14->o    i15->o    i16->o    i17->o 
##    141.57    -42.78    238.12    106.74      3.69  -4246.40   2838.74 
##    i18->o    i19->o    i20->o    i21->o    i22->o    i23->o    i24->o 
##   4409.11    123.36   6669.83   1305.45   -213.10   -103.35  -5569.04 
##    i25->o    i26->o    i27->o    i28->o    i29->o    i30->o 
##  -1208.01     -2.36      1.71     93.81     62.72  -1887.30
# Predict test-set prices with the fitted neural network. predict.nnet
# returns a matrix; column 1 holds the single output unit's values.
test_nnet<-predict(nnet_0, newdata = whole_data$test)

df_pred<-cbind(df_pred, test_nnet=test_nnet[,1])
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  9 variables:
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price       : num  17450 18920 30760 5151 6377 ...
##  $ test_tree   : num  13944 20579 36964 7402 7402 ...
##  $ test_rf     : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb    : num  18353 17226 39207 5644 5737 ...
##  $ test_lm     : num  19378 19208 30497 2257 5859 ...
##  $ test_glmnet : num  17943 19133 28520 442 5917 ...
##  $ test_xgb_reg: num  23643 21512 26985 8071 6221 ...
##  $ test_nnet   : num  20025 19293 30642 1388 5636 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# All eight model columns plus the true price, one colour each. Title
# corrected: the chart compares every model, not only the regression tree.
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
  geom_point(alpha=0.65)+geom_line(alpha=0.65)+
  ylim(0,50000)+xlab('')+ylab('$')+
  ggtitle('Model Comparison - Test Prediction on Automobile Price')+
  scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray','palegreen','cornflowerblue'))

###############################

#### model evaluation

# Compute test RMSE / MAE / MAPE for every prediction column in df_pred.
# Fix: the method labels were a hard-coded vector whose order had to match
# the column order of df_pred exactly -- any reordering or added model would
# silently mislabel every row. Labels are now derived from the column names
# themselves ('test_' prefix stripped), so they can never drift out of sync.
# The repeated `df_pred[,!c('price','id')]` selection is hoisted once.
pred_cols<-df_pred[,!c('price','id')]
result<-data.table(method=gsub('^test_','',names(pred_cols)),
                   rmse=sapply(pred_cols,function(x) rmse(real=df_pred$price, predicted=x)),
                   mae=sapply(pred_cols,function(x) mae(real=df_pred$price, predicted=x)),
                   mape=sapply(pred_cols,function(x) mape(real=df_pred$price, predicted=x)))


result
##     method     rmse      mae      mape
## 1:    tree 3928.491 2977.303 0.2239767
## 2:      rf 2249.769 1588.936 0.1245050
## 3:     xgb 3396.981 2212.590 0.1621892
## 4:      lm 3205.921 2319.084 0.1995611
## 5:  glmnet 3095.549 2127.579 0.1758675
## 6: xgb_reg 3001.003 2256.996 0.1656683
## 7:    nnet 3229.050 2330.489 0.1993993
# Best model under each metric -- random forest wins on all three here.
result[which.min(result$rmse)]
##    method     rmse      mae     mape
## 1:     rf 2249.769 1588.936 0.124505
result[which.min(result$mae)]
##    method     rmse      mae     mape
## 1:     rf 2249.769 1588.936 0.124505
result[which.min(result$mape)]
##    method     rmse      mae     mape
## 1:     rf 2249.769 1588.936 0.124505
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  9 variables:
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price       : num  17450 18920 30760 5151 6377 ...
##  $ test_tree   : num  13944 20579 36964 7402 7402 ...
##  $ test_rf     : num  16786 18333 34747 6027 6002 ...
##  $ test_xgb    : num  18353 17226 39207 5644 5737 ...
##  $ test_lm     : num  19378 19208 30497 2257 5859 ...
##  $ test_glmnet : num  17943 19133 28520 442 5917 ...
##  $ test_xgb_reg: num  23643 21512 26985 8071 6221 ...
##  $ test_nnet   : num  20025 19293 30642 1388 5636 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Build (and store, for ggplotly below) the final comparison chart of all
# models against the true price. Title corrected: this is not just the tree.
p<-ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
  geom_point(alpha=0.65)+geom_line(alpha=0.65)+
  ylim(0,50000)+xlab('')+ylab('$')+
  ggtitle('Model Comparison - Test Prediction on Automobile Price')+
  scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray','palegreen','cornflowerblue'))

# plotly is loaded last; note the masking it causes (select, slice,
# last_plot, filter, layout) -- any later calls to those names resolve to
# plotly's versions unless namespaced explicitly (e.g. dplyr::select).
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Render the stored ggplot comparison chart as an interactive plotly widget.
ggplotly(p)